library(knitr)
# Load the restaurant dataset from CSV.
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative path so the script runs on other machines.
Dataset_bA <- read.csv("C:/Users/Mideh/Downloads/Dataset_bA.csv")
# Look at the first 7 rows and at the column structure of the dataset, then
# continue working on a copy with a shorter name.
head(Dataset_bA, 7)
# str() prints a compact per-column summary. structure() only returns its
# argument unchanged, which printed the entire data frame instead.
str(Dataset_bA)
D_A <- Dataset_bA
# Count blank ("") cells in the entire dataset and in the Cuisines column.
# na.rm = TRUE is used in both counts so that an NA cell cannot turn the
# total into NA.
sum(D_A == "", na.rm = TRUE)
## [1] 9
sum(D_A$Cuisines == "", na.rm = TRUE)
## [1] 9
There were 9 blanks in the dataset, all of which were in the Cuisines column.
# Clean the dataset: recode every blank ("") cell as NA, then keep only the
# rows that contain no NA in any column.
is.na(D_A) <- D_A == ""
D_A_clean <- na.omit(D_A)
# loading necessary libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
Task 1: TOP CUISINES AND THEIR PERCENTAGES
# Expand the comma-separated 'Cuisines' column so that every
# restaurant-cuisine pair gets its own row (a restaurant listing three
# cuisines becomes three rows).
D_A_cuisines <- separate_rows(D_A_clean, Cuisines, sep = ",")
# Strip the whitespace left around each cuisine name by the split.
D_A_cuisines <- mutate(D_A_cuisines, Cuisines = trimws(Cuisines))
print(D_A_cuisines)
## # A tibble: 19,710 × 21
## Restaurant.ID Restaurant.Name Country.Code City Address Locality
## <int> <chr> <int> <chr> <chr> <chr>
## 1 6317637 Le Petit Souffle 162 Makati Ci… Third … Century…
## 2 6317637 Le Petit Souffle 162 Makati Ci… Third … Century…
## 3 6317637 Le Petit Souffle 162 Makati Ci… Third … Century…
## 4 6304287 Izakaya Kikufuji 162 Makati Ci… Little… Little …
## 5 6300002 Heat - Edsa Shangri-La 162 Mandaluyo… Edsa S… Edsa Sh…
## 6 6300002 Heat - Edsa Shangri-La 162 Mandaluyo… Edsa S… Edsa Sh…
## 7 6300002 Heat - Edsa Shangri-La 162 Mandaluyo… Edsa S… Edsa Sh…
## 8 6300002 Heat - Edsa Shangri-La 162 Mandaluyo… Edsa S… Edsa Sh…
## 9 6318506 Ooma 162 Mandaluyo… Third … SM Mega…
## 10 6318506 Ooma 162 Mandaluyo… Third … SM Mega…
## # ℹ 19,700 more rows
## # ℹ 15 more variables: Locality.Verbose <chr>, Longitude <dbl>, Latitude <dbl>,
## # Cuisines <chr>, Average.Cost.for.two <int>, Currency <chr>,
## # Has.Table.booking <chr>, Has.Online.delivery <chr>,
## # Is.delivering.now <chr>, Switch.to.order.menu <chr>, Price.range <int>,
## # Aggregate.rating <dbl>, Rating.color <chr>, Rating.text <chr>, Votes <int>
# Count the distinct restaurants serving each cuisine and keep the three
# most common ones. slice_max() replaces the superseded top_n(); it returns
# the rows sorted in descending order and keeps ties at the cut-off, so no
# separate arrange() step is needed.
top_cuisines <- D_A_cuisines %>%
  group_by(Cuisines) %>%
  summarise(restaurant_count = n_distinct(Restaurant.ID)) %>%
  slice_max(restaurant_count, n = 3)
# Calculate the percentage of restaurants serving each top cuisine.
# The denominator counts distinct Restaurant.ID values so that it is in the
# same unit as restaurant_count.
total_restaurants <- n_distinct(D_A_cuisines$Restaurant.ID)
top_cuisines <- top_cuisines %>%
  mutate(percentage = (restaurant_count / total_restaurants) * 100)
# View results
print(top_cuisines)
## # A tibble: 3 × 3
## Cuisines restaurant_count percentage
## <chr> <int> <dbl>
## 1 North Indian 3960 41.5
## 2 Chinese 2733 28.6
## 3 Fast Food 1986 20.8
Task 2: CITY ANALYSIS
# Step 1: Identify the City with the Highest Number of Restaurants
# Count distinct Restaurant.ID values per city (the table has one row per
# restaurant-cuisine pair, so plain row counts would over-count).
city_restaurant_count <- group_by(D_A_cuisines, City)
city_restaurant_count <- summarise(
  city_restaurant_count,
  restaurant_count = n_distinct(Restaurant.ID)
)
city_restaurant_count <- arrange(city_restaurant_count, desc(restaurant_count))
# The table is sorted in descending order, so its first row is the city
# with the most restaurants.
city_with_most_restaurants <- city_restaurant_count[1, ]
print(city_with_most_restaurants)
## # A tibble: 1 × 2
## City restaurant_count
## <chr> <int>
## 1 New Delhi 5473
# Step 2: Calculate the Average Rating for Restaurants in Each City
# D_A_cuisines holds one row per restaurant-cuisine pair, so averaging it
# directly weights every restaurant by the number of cuisines it lists.
# Collapse to one row per restaurant before averaging.
city_avg_rating <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  group_by(City) %>%
  summarise(average_rating = mean(Aggregate.rating, na.rm = TRUE)) %>%
  arrange(desc(average_rating))
# Display the average rating for each city
print(city_avg_rating)
# NOTE: the captured output below, and the Step 3 output further down, was
# produced by the earlier cuisine-weighted calculation; re-run to refresh it.
## # A tibble: 140 × 2
## City average_rating
## <chr> <dbl>
## 1 Inner City 4.9
## 2 Quezon City 4.8
## 3 Makati City 4.72
## 4 Mandaluyong City 4.6
## 5 Beechworth 4.6
## 6 Pasig City 4.53
## 7 London 4.53
## 8 Taguig City 4.53
## 9 Lincoln 4.5
## 10 Secunderabad 4.5
## # ℹ 130 more rows
# Step 3: Determine the City with the Highest Average Rating
# city_avg_rating is sorted in descending order, so the first row is the
# top-rated city.
city_with_highest_avg_rating <- city_avg_rating[1, ]
print(city_with_highest_avg_rating)
## # A tibble: 1 × 2
## City average_rating
## <chr> <dbl>
## 1 Inner City 4.9
Task 3: RANGE DISTRIBUTION
library(ggplot2)
# Step 1: Create a Bar Chart of Price Range Distribution
# D_A_cuisines has one row per restaurant-cuisine pair; keep one row per
# restaurant so the bars really show the "Number of Restaurants".
ggplot(
  distinct(D_A_cuisines, Restaurant.ID, .keep_all = TRUE),
  aes(x = Price.range)
) +
  geom_bar(fill = "yellow") +
  labs(
    title = "Price Range Distribution",
    x = "Price Range",
    y = "Number of Restaurants"
  ) +
  theme_minimal()
# Step 2: Calculate the Percentage of Restaurants in Each Price Range Category
# Each restaurant is counted once, for the same reason as in the chart.
price_range_distribution <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  group_by(Price.range) %>%
  summarise(count = n()) %>%
  mutate(percentage = (count / sum(count)) * 100)
# Print the price range distribution with percentages
print(price_range_distribution)
# NOTE: the captured output below was produced from restaurant-cuisine rows
# (before the per-restaurant de-duplication); re-run to refresh it.
## # A tibble: 4 × 3
## Price.range count percentage
## <int> <int> <dbl>
## 1 1 7428 37.7
## 2 2 7133 36.2
## 3 3 3758 19.1
## 4 4 1391 7.06
Task 4: ONLINE DELIVERY
# Step 1: Determine the Percentage of Restaurants that Offer Online Delivery
# D_A_cuisines has one row per restaurant-cuisine pair, so it is collapsed to
# one row per restaurant first; otherwise restaurants listing many cuisines
# are counted several times.
online_delivery_percentage <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  group_by(Has.Online.delivery) %>%
  summarise(count = n()) %>%
  mutate(percentage = (count / sum(count)) * 100)
# Print the percentage of restaurants that offer online delivery
print(online_delivery_percentage)
# NOTE: the captured output below was produced from restaurant-cuisine rows
# (before the per-restaurant de-duplication); re-run to refresh it.
## # A tibble: 2 × 3
## Has.Online.delivery count percentage
## <chr> <int> <dbl>
## 1 No 13909 70.6
## 2 Yes 5801 29.4
# Step 2: Compare the Average Ratings of Restaurants With and Without Online Delivery
# Average over one row per restaurant. D_A_cuisines repeats a restaurant once
# per cuisine, which would weight the mean by the number of cuisines listed.
avg_ratings_online_delivery <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  group_by(Has.Online.delivery) %>%
  summarise(average_rating = mean(Aggregate.rating, na.rm = TRUE))
# Print the comparison of average ratings
print(avg_ratings_online_delivery)
# NOTE: the captured output below was produced by the earlier
# cuisine-weighted calculation; re-run to refresh it.
## # A tibble: 2 × 2
## Has.Online.delivery average_rating
## <chr> <dbl>
## 1 No 2.68
## 2 Yes 3.31
# Step 1: Analyze the Distribution of Aggregate Ratings
# Create a histogram of the aggregate ratings. D_A_cuisines repeats each
# restaurant once per cuisine, so it is reduced to one row per restaurant to
# make the y axis a true "Count of Restaurants".
ggplot(
  distinct(D_A_cuisines, Restaurant.ID, .keep_all = TRUE),
  aes(x = Aggregate.rating)
) +
  geom_histogram(binwidth = 0.5, fill = "skyblue", color = "black") +
  labs(
    title = "Distribution of Aggregate Ratings",
    x = "Aggregate Rating",
    y = "Count of Restaurants"
  ) +
  theme_minimal()
# Step 1.2: Identify the most common rating
# Counts restaurants (not restaurant-cuisine rows) per exact rating value and
# sorts so the most frequent value comes first.
rating_range <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  group_by(Aggregate.rating) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
most_common_rating_range <- rating_range[1, ]
print(most_common_rating_range)
# NOTE: the captured output below was produced from restaurant-cuisine rows
# (before the per-restaurant de-duplication); re-run to refresh it.
## # A tibble: 1 × 2
## Aggregate.rating count
## <dbl> <int>
## 1 0 3443
# Step 2: Calculate the Average Number of Votes Received by Restaurants
# Average over one row per restaurant. D_A_cuisines repeats a restaurant's
# Votes once per cuisine, which would bias the mean towards restaurants that
# list many cuisines.
Average_votes <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  summarise(average_votes = mean(Votes, na.rm = TRUE))
# Step 1: Identify the Most Common Combinations of Cuisines
# Work from D_A_clean, where Cuisines still holds each restaurant's full
# comma-separated list. Grouping the split table (D_A_cuisines) counts
# individual cuisines, not combinations.
# Each list is normalised (trimmed, de-duplicated, sorted) so that
# "Chinese, North Indian" and "North Indian, Chinese" count as one combination.
Common_cuisines_combination <- D_A_clean %>%
  mutate(
    Cuisines = vapply(
      strsplit(Cuisines, ",", fixed = TRUE),
      function(parts) paste(sort(unique(trimws(parts))), collapse = ", "),
      character(1)
    )
  ) %>%
  group_by(Cuisines) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
print(Common_cuisines_combination)
# NOTE: the captured output below was produced by the earlier per-cuisine
# count (individual cuisines, not combinations); re-run to refresh it.
## # A tibble: 145 × 2
## Cuisines count
## <chr> <int>
## 1 North Indian 3960
## 2 Chinese 2735
## 3 Fast Food 1986
## 4 Mughlai 995
## 5 Italian 764
## 6 Bakery 745
## 7 Continental 736
## 8 Cafe 703
## 9 Desserts 653
## 10 South Indian 636
## # ℹ 135 more rows
library(sf)
## Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.3.1; sf_use_s2() is TRUE
# Step 2: Determine if Certain Cuisine Combinations Tend to Have Higher Ratings
# Calculate the average aggregate rating and the number of rows for each
# Cuisines value, highest average first.
# The original had `count = n()` inside the mean() call, where it was
# silently swallowed as an unused argument and no count column was created.
# NOTE(review): D_A_cuisines is the split table, so groups here are
# individual cuisines rather than full combinations - confirm the intent.
cuisines_combinations_ratings <- D_A_cuisines %>%
  group_by(Cuisines) %>%
  summarise(
    Avg_rating = mean(Aggregate.rating, na.rm = TRUE),
    count = n()
  ) %>%
  arrange(desc(Avg_rating))
# Plot restaurant locations on a map.
# Build an sf point layer from the Longitude/Latitude columns, declared as
# EPSG:4326 coordinates.
restaurants_sf <- D_A_cuisines %>%
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326)
# Draw the points with a minimal theme and labelled axes.
ggplot() +
  geom_sf(data = restaurants_sf) +
  labs(x = "Longitude", y = "Latitude", title = "Restaurant Locations") +
  theme_minimal()
# Step 1: Identify restaurant chains (restaurants with multiple locations)
# D_A_cuisines repeats each restaurant once per cuisine. Without collapsing
# to one row per Restaurant.ID first, sum(Votes) is multiplied by the number
# of cuisines a location lists and the mean rating is weighted the same way.
restaurant_chains <- D_A_cuisines %>%
  distinct(Restaurant.ID, .keep_all = TRUE) %>%
  group_by(Restaurant.Name) %>%
  summarise(
    location_count = n_distinct(Restaurant.ID), # Number of locations
    avg_rating = mean(Aggregate.rating),        # Mean rating across locations
    total_votes = sum(Votes)                    # Total votes (popularity)
  ) %>%
  filter(location_count > 1) # Keep only chains (more than 1 location)
# Step 2: View the result
print(restaurant_chains)
# NOTE: the captured chain tables below (including the sorted views) were
# produced before the per-restaurant de-duplication, so their avg_rating and
# total_votes figures are inflated/weighted; re-run to refresh them.
## # A tibble: 734 × 4
## Restaurant.Name location_count avg_rating total_votes
## <chr> <int> <dbl> <int>
## 1 10 Downing Street 2 4 1340
## 2 221 B Baker Street 3 3.37 215
## 3 34 Parkstreet Lane 2 3.05 31
## 4 34, Chowringhee Lane 12 2.39 777
## 5 4700BC Popcorn 2 3.5 176
## 6 6 Pack Momos 2 1.4 8
## 7 A Piece of Paris 2 3.75 162
## 8 AB's - Absolute Barbecues 4 4.82 40200
## 9 AB's Absolute Barbecues 2 4.85 6302
## 10 Aap Ki Khatir 2 0 0
## # ℹ 724 more rows
# Rank the chains from highest to lowest average rating to see which are
# rated best.
top_rated_chains <- arrange(restaurant_chains, desc(avg_rating))
print(top_rated_chains)
## # A tibble: 734 × 4
## Restaurant.Name location_count avg_rating total_votes
## <chr> <int> <dbl> <int>
## 1 Talaga Sampireun 3 4.9 11028
## 2 AB's Absolute Barbecues 2 4.85 6302
## 3 Silantro Fil-Mex 2 4.85 2728
## 4 AB's - Absolute Barbecues 4 4.82 40200
## 5 Naturals Ice Cream 2 4.8 3094
## 6 Gymkhana 2 4.7 756
## 7 The Cheesecake Factory 2 4.65 6020
## 8 Dishoom 2 4.61 4771
## 9 Chili's 5 4.6 30215
## 10 Garota de Ipanema 2 4.6 118
## # ℹ 724 more rows
# Rank the chains from most to fewest total votes, using votes as the
# popularity measure.
most_popular_chains <- arrange(restaurant_chains, desc(total_votes))
print(most_popular_chains)
## # A tibble: 734 × 4
## Restaurant.Name location_count avg_rating total_votes
## <chr> <int> <dbl> <int>
## 1 Barbeque Nation 26 4.33 58631
## 2 Big Chill 4 4.47 43412
## 3 AB's - Absolute Barbecues 4 4.82 40200
## 4 Tea Villa Cafe 4 3.92 31002
## 5 Chili's 5 4.6 30215
## 6 Truffles 2 4.32 29016
## 7 Haldiram's 16 3.63 28445
## 8 Pirates of Grill 4 4.03 27342
## 9 Subway 63 2.91 24496
## 10 Out Of The Box 2 3.89 23209
## # ℹ 724 more rows